Text analysis: title and abstract of male and female speakers

Title analysis

# Load the presentation records, then parse `date` as day-month-year and
# derive the calendar year from it.
data <- read.table(
  "data/presentations_PPGE_2008-2019.csv",
  sep = ",",
  header = TRUE,
  as.is = TRUE
)
data$date <- dmy(data$date)
data$year <- year(data$date)
# skimr::skim(data)

Excluding special events, such as round tables and discussions, that are not related to a project or study presented by a speaker.

# Drop the listed record ids and the rows without an English title, then
# cross-tabulate position category against gender.
IDs <- c(154, 250, 211, 289)
data <- data %>%
  filter(!(id %in% IDs), !is.na(title_english))
table(data$position_cat, data$gender)
##            
##              F  M
##   others     4  1
##   postdoc   25 32
##   professor 25 75
##   student   88 75

Tidytext

# Keep only the columns used in the title analysis.
tit <- dplyr::select(data, id, gender, position_cat, audience_n, title_english)

# Tokenise the English titles: one row per word, stored in `word`.
text_tok <- unnest_tokens(tit, output = word, input = title_english)

Stopwords: excluir palavras que não agregam significado, como “and”, “or”, “the”, “of” e “in”.

# English stopword list taken from the "stopwords-iso" source.
stop_w <- tibble(word = stopwords(source = "stopwords-iso"))

# Remove the stopwords from the tokenised titles.
text <- anti_join(text_tok, stop_w, by = "word")

# Extra tokens to discard: numbers, a dash-like symbol and other leftovers.
# NOTE(review): the first entry looks mis-encoded here — confirm the file
# encoding so that it still matches the token it was meant to remove.
remover <- c("ăƒŒ", "1", "1st", "2", "364", "40", "70", "750", "aff", "da")

text <- filter(text, !(word %in% remover))

# Simple plural handling: every word listed in `plural` is reduced to its
# singular by trimming the plural ending.
plural <- c("actions","advances", "adaptations", "amphibians", "animals", "ants","anurans",
            "applications","approaches", "bees","builds", "birds",
            "cerrados","challenges",
            "continents","crops", 
            "decisions","declines","determines","determinants", "defenses",
            "dynamics",
            "economics", "ecosystems","environments", "experiences",
            "forests",
            "genetics","gifts","gradients","guides","impacts",
            "increases","interactions","lives",
            "landscapes","males","mammals", "mangroves","models","movements",
            "mutualisms","networks","neotropics",
            "opilions","phenotypes","plants","projects","paths", "perspectives",
            "populations","promotes","relationships", "relations",
            "resources","responses","roads","services","skulls","snakes","seeds",
            "spaces", "spiders","stages", "trees", "variations",
            "threats")

# Plurals formed with "es": dropping only the final "s" would leave a
# non-word ("approaches" -> "approache") that never merges with the singular.
plural_es <- c("approaches")

is_plural <- text$word %in% plural
n_drop <- ifelse(text$word[is_plural] %in% plural_es, 2L, 1L)
text$word[is_plural] <- substr(
  text$word[is_plural],
  1,
  nchar(text$word[is_plural]) - n_drop
)

Agrupando palavras parecidas

# Hand-built lemma table: each row pairs a word variant (first column) with
# the form it is replaced by (second column). Stored as a two-column data
# frame with the default column names V1 and V2.
lemma <- as.data.frame(matrix(
  c(
    "adaptive", "adaptation",
    "advancement", "advance",
    "agricultural", "agriculture",
    "agro", "agriculture",
    "amazonia", "amazon",
    "amazonian", "amazon",
    "andean", "andes",
    "apply", "application",
    "applying", "application",
    "apidae", "apis",
    "arachnida", "arachnid",
    "argue", "argument",
    "basal", "basis",
    "behavioral", "behavior",
    "behavioural", "behavior",
    "bignonieae", "bignoniaceae",
    "biological", "biology",
    "brazilian", "brazil",
    "building", "build",
    "changing", "change",
    "cnidarian", "cnidaria",
    "coastal", "coast",
    "colour", "color",
    "colors", "color",
    "communities", "community",
    "competitive", "competition",
    "complexity", "complex",
    "convergences", "convergence",
    "convergent", "convergence",
    "cordatus", "cordata",
    "croplands", "crop",
    "cultural", "culture",
    "darwin's", "darwin",
    "darwinian", "darwin",
    "defensive", "defense",
    "dependent", "dependence",
    "detecting", "detection",
    "determine", "determinant",
    "developmental", "development",
    "dispersers", "dispersal",
    "disturbed", "disturbance",
    "diversification", "diversity",
    "dragonflies", "dragonfly",
    "drier", "drought",
    "ecological", "ecology",
    "ecologists", "ecology",
    "endemic", "endemism",
    "effectiveness", "efficiency",
    "environmental", "environment",
    "evolutionary", "evolution",
    "expanding", "expansion",
    "extinct", "extinction",
    "facilitate", "facilitation",
    "fisheries", "fishery",
    "floral", "flora",
    "floristic", "flora",
    "forested", "forest",
    "functional", "function",
    "functionally", "function",
    "functioning", "function",
    "geographical", "geographic",
    "heterogeneties", "heterogeneity",
    "heterogeneous", "heterogeneity",
    "histories", "history",
    "integrated", "integration",
    "intregating", "integration",
    "integrative", "integration",
    "invasive", "invasion",
    "isotopic", "isotope",
    "linking", "link",
    "living", "live",
    "mammalia", "mammal",
    "managed", "manage",
    "managers", "manage",
    "mathematical", "mathematics",
    "mates", "mating",
    "mediated", "mediate",
    "mechanistic", "mechanism",
    "matrices", "matrix",
    "migratory", "migration",
    "mimicking", "mimicry",
    "modeling", "model",
    "mutualistic", "mutualism",
    "natural", "nature",
    "neotropical", "neotropic",
    "northeastern", "northeast",
    "occuring", "occur",
    "onça", "onca",
    "opiliones", "opilion",
    "parasite", "parasitism",
    "parent", "parenting",
    "phylogenies", "phylogeny",
    "phylogenetic", "phylogeny",
    "phylogenomic", "phylogeny",
    "pollinators", "pollination",
    "protected", "protect",
    "protective", "protect",
    "rainfall", "rain",
    "reconstructing", "reconstruction",
    "regulatory", "regulation",
    "regulates", "regulation",
    "relation", "relationship",
    "reproductive", "reproduction",
    "restored", "restoration",
    "robustness", "robust",
    "scientific", "science",
    "scientist", "science",
    "sexy", "sexual",
    "simulated", "simulation",
    "societies", "society",
    "social", "society",
    "socio", "society",
    "space", "spatial",
    "spacio", "spatial",
    "stabilize", "stability",
    "stable", "stability",
    "stories", "story",
    "strategic", "strategy",
    "strategies", "strategy",
    "structured", "structure",
    "structuring", "structure",
    "studies", "study",
    "studing", "study",
    "sustainable", "sustainability",
    "theories", "theory",
    "theoretical", "theory",
    "threatened", "threat",
    "tropical", "tropic",
    "vision", "visual"
  ),
  ncol = 2,
  byrow = TRUE
))

# Replace every word variant (column 1 of `lemma`) by its mapped form
# (column 2), one table row at a time and in table order.
# seq_len() makes the loop a no-op for an empty table, whereas
# 1:dim(lemma)[1] would iterate over c(1, 0).
for (i in seq_len(nrow(lemma))) {
  text$word[text$word == lemma[i, 1]] <- lemma[i, 2]
}

Contando as palavras

# Frequency of each word across all titles.
pala <- count(text, word)

Palavras mais comuns

# Table of the words that occur more than 8 times, most frequent first.
kable(
  filter(count(text, word, sort = TRUE), n > 8)
)
word n
ecology 49
forest 42
evolution 32
landscape 27
bird 22
model 22
diversity 21
environment 21
species 21
plant 18
structure 17
atlantic 15
brazil 15
effects 15
conservation 14
interaction 13
bee 12
community 12
network 12
patterns 12
são 12
study 12
application 11
behavior 11
dynamic 11
ecosystem 11
paulo 11
population 11
role 11
change 10
male 10
mutualism 10
neotropic 10
pollination 10
science 10
sexual 10
animal 9
biology 9
care 9
cerrado 9
genetic 9
habitat 9
mating 9
opilion 9
selection 9
society 9
# Word counts per gender and each word's share of that gender's words,
# reshaped to one row per word with a proportion and an n column per gender.
props <- count(text, gender, word) %>%
  # filter(n > 1) %>% # would drop words used only once
  group_by(gender) %>%
  mutate(proportion = n / sum(n)) %>%
  pivot_wider(
    names_from = gender,
    values_from = c(proportion, n)
  )
library(scales)
# Log-log scatter of each word's share in titles by male (x) versus female
# (y) speakers; the dashed line is the identity line.
# The colour mapping lives inside aes() so that scale_color_gradient() has a
# mapped aesthetic to act on; passed as an extra ggplot() argument it was
# silently ignored. The stray empty argument in aes() is removed as well.
ggplot(props, aes(x = proportion_M, y = proportion_F,
                  color = abs(proportion_F - proportion_M))) +
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
  scale_x_log10(labels = percent_format(), limits = c(0.0005, 0.03)) +
  scale_y_log10(labels = percent_format(), limits = c(0.0005, 0.03)) +
  scale_color_gradient(low = "blue", high = "red")

# Same male-versus-female comparison on untransformed proportion axes.
ggplot(props, aes(x = proportion_M, y = proportion_F)) +
  geom_point(alpha = 0.1) +
  geom_abline(color = "gray40", lty = 2) +
  geom_text(aes(label = word), check_overlap = TRUE)

# Raw counts instead of proportions, with both axes limited to -1..30.
ggplot(props, aes(x = n_M, y = n_F)) +
  geom_point(alpha = 0.1) +
  geom_abline(color = "gray40", lty = 2) +
  geom_text(aes(label = word), check_overlap = TRUE) +
  xlim(-1, 30) +
  ylim(-1, 30)

# Words that occur more than 8 times overall, most frequent first.
seleciona <- filter(arrange(pala, desc(n)), n > 8)

# Per-gender counts and proportions recomputed on those frequent words only.
props <- text %>%
  filter(word %in% seleciona$word) %>%
  count(gender, word) %>%
  group_by(gender) %>%
  mutate(proportion = n / sum(n)) %>%
  pivot_wider(
    names_from = gender,
    values_from = c(proportion, n)
  )


# Long table for the diverging bar chart: one row per word and gender, with
# the female proportion negated so its bars extend to the left of zero.
test <- props %>%
  arrange(desc(proportion_F), desc(proportion_M)) %>%
  # A word used by one gender only has NA for the other; count it as 0 so the
  # total, and therefore the ordering of the bars, stays defined.
  mutate(ntot = coalesce(n_F, 0L) + coalesce(n_M, 0L)) %>%
  mutate(word = fct_reorder(word, ntot, max)) %>%
  mutate(proportion_F = -proportion_F)

# Columns are picked by name rather than by position so the reshape does not
# depend on the column order produced by pivot_wider().
test <- test %>%
  dplyr::select(word, proportion_F, proportion_M) %>%
  pivot_longer(c(proportion_F, proportion_M),
               names_to = "gender", values_to = "proportion")

# Diverging bar chart of word proportions by gender (the female values were
# negated upstream, so they are drawn to the left of zero), leaving out a
# hand-picked set of words. The x axis is relabelled with absolute values.
# "network" was misspelled as "nework" in the exclusion list, so the filter
# never matched it.
test %>%
  filter(!word %in% c("animal", "behavior", "opilion", "role",
                      "science", "sexual", "network")) %>%
  ggplot(aes(x = proportion, y = word, fill = gender)) +
  geom_col() +
  ylab("") +
  xlab("Proportion") +
  scale_fill_manual(name = "gender", values = c("#6D57CF", "#FCA532"),
                    labels = c("F", "M")) +
  geom_vline(xintercept = c(-0.1, -0.05, -0.02, 0, 0.02, 0.05, 0.10),
             linetype = "dotted",
             col = "darkgray") +
  scale_x_continuous(breaks = c(-0.1, -0.05, 0, 0.05, 0.10),
                     labels = c(0.10, 0.05, 0, 0.05, 0.10))

# Saves the plot displayed last.
ggsave("figures/title_wordFrequency.jpeg", units = "in", width = 7, height = 7, dpi = 300)

word cloud

# Word cloud built from every title word.
textplot_wordcloud(x = dfm(tokens(text$word)))

# One cloud per gender, each in its own colour, on a 1 x 2 layout.
# NOTE(review): par(new = TRUE) normally keeps the next base-graphics plot on
# the current panel — confirm the two clouds end up side by side as intended.
par(mfrow = c(1, 2))
textplot_wordcloud(
  x = dfm(tokens(with(text, word[gender == "F"]))),
  col = "#6D57CF"
)
par(new = TRUE)
textplot_wordcloud(
  x = dfm(tokens(with(text, word[gender == "M"]))),
  col = "#FCA532"
)

TF IDF

# tf-idf of title words, with each gender treated as one document, sorted
# from highest to lowest score.
text_id <- text %>%
  count(gender, word) %>%
  bind_tf_idf(word, gender, n) %>%
  arrange(desc(tf_idf))
# text_id
text_id$word <- as.factor(text_id$word)

# Top 5 words per gender by tf-idf (ties are kept by top_n()).
# The axis labels now follow the aesthetics: tf-idf is on x and the words on
# y; previously the word axis was titled "tf-idf" and the value axis blank.
text_id %>%
  group_by(gender) %>%
  arrange(desc(tf_idf)) %>%
  top_n(5, tf_idf) %>%
  ggplot(aes(x = tf_idf, y = reorder(word, tf_idf), fill = gender)) +
  geom_col(show.legend = FALSE) +
  labs(x = "tf-idf", y = NULL) +
  facet_wrap(~gender, scales = "free") +
  theme_minimal()

N-GRAMS

# Tokenise the English titles into bigrams (n-grams with n = 2).
bigrams <- unnest_tokens(
  tit,
  bigram,
  title_english,
  token = "ngrams",
  n = 2
)

excluindo stopwords

# Split each bigram into its two words while keeping the combined column.
bigrams <- separate(
  bigrams,
  col = bigram,
  into = c("word1", "word2"),
  sep = " ",
  remove = FALSE
)

# Keep the bigrams in which neither word appears in `stop_words`.
# NOTE(review): the single-word analysis filters with `stop_w` instead —
# confirm that `stop_words` is the intended list here.
bigrams_stop <- filter(
  bigrams,
  !(word1 %in% stop_words$word),
  !(word2 %in% stop_words$word)
)

# Bigram frequency per gender, most frequent first.
count(bigrams_stop, gender, bigram, sort = TRUE)

# Bigrams seen more than once; n2 is a copy of the count whose sign is
# flipped for gender "F" so the bars diverge from zero.
test <- bigrams_stop %>%
  count(gender, bigram, sort = TRUE) %>%
  filter(n > 1) %>%
  mutate(n2 = n)

test$n2[test$gender == "F"] <- -1 * test$n2[test$gender == "F"]
test$bigram <- fct_reorder(test$bigram, test$n2, min)

ggplot(test, aes(x = n2, y = fct_rev(bigram), fill = gender)) +
  geom_col()

# Horizontal bars of the bigrams that occur more than once, one facet per
# gender.
bigrams_stop %>%
  count(gender, bigram, sort = TRUE) %>%
  filter(n > 1) %>%
  ggplot(aes(x = reorder(bigram, n), y = n)) +
  geom_col(show.legend = FALSE) +
  labs(
    x = NULL,
    y = "frequency",
    title = "Most frequent bigrams"
  ) +
  coord_flip() +
  facet_wrap(~gender) +
  theme_minimal()

To find bigrams that contain specific words, we can use filter():

# Bigrams containing "ecolog", kept grouped by gender.
bieco <- bigrams_stop %>%
  group_by(gender) %>%
  filter(str_detect(bigram, "ecolog"))

# Occurrences of each of those bigrams, one count column per gender.
bieco %>%
  count(gender, bigram, sort = TRUE) %>%
  pivot_wider(
    names_from = gender,
    values_from = n
  )
## # A tibble: 35 × 3
##    bigram                      M     F
##    <chr>                   <int> <int>
##  1 behavioral ecology          3    NA
##  2 ecological approach        NA     2
##  3 landscape ecology           2     2
##  4 ecological interactions     2    NA
##  5 ecological niche            2    NA
##  6 ecological analysis        NA     1
##  7 ecological context         NA     1
##  8 ecological processes       NA     1
##  9 ethno ecology              NA     1
## 10 human ecology              NA     1
## # … with 25 more rows
# Bigrams containing "evolu", one column per gender. distinct() runs before
# count(), so each gender/bigram pair is counted once.
# NOTE(review): the "ecolog" table counts occurrences instead — confirm that
# presence/absence is what is wanted here.
bigrams_stop %>%
  group_by(gender) %>%
  filter(str_detect(bigram, "evolu")) %>%
  distinct(bigram) %>%
  count(gender, bigram, sort = TRUE) %>%
  pivot_wider(
    names_from = gender,
    values_from = n
  )
## # A tibble: 14 × 3
##    bigram                        F     M
##    <chr>                     <int> <int>
##  1 convergent evolution          1    NA
##  2 evolutionary constraint       1    NA
##  3 colors evolution             NA     1
##  4 ecology evolution            NA     1
##  5 evolutionary biology         NA     1
##  6 evolutionary convergences    NA     1
##  7 evolutionary ecology         NA     1
##  8 evolutionary game            NA     1
##  9 evolutionary innovation      NA     1
## 10 evolutionary patterns        NA     1
## 11 evolutionary radiation       NA     1
## 12 evolutionary trajectory      NA     1
## 13 guilds evolution             NA     1
## 14 micro evolution              NA     1

Quase não há mulheres que falam de evolução.

# Bigrams containing "forest", one column per gender. As distinct() runs
# before count(), each gender/bigram pair is counted once.
bigrams_stop %>%
  group_by(gender) %>%
  filter(str_detect(bigram, "forest")) %>%
  distinct(bigram) %>%
  count(gender, bigram, sort = TRUE) %>%
  pivot_wider(
    names_from = gender,
    values_from = n
  )
## # A tibble: 30 × 3
##    bigram               F     M
##    <chr>            <int> <int>
##  1 atlantic forest      1     1
##  2 forest bird          1     1
##  3 forest birds         1    NA
##  4 forest corridors     1     1
##  5 forest cover         1    NA
##  6 forest field         1    NA
##  7 forest forests       1    NA
##  8 forest landscape     1    NA
##  9 forest loss          1    NA
## 10 forest products      1    NA
## # … with 20 more rows

wordcloud bigram ecology

# Bigrams containing "ecolog"; "ecological" is rewritten as "ecology" in both
# word columns and the bigram text is rebuilt from them.
bieco <- bigrams_stop %>%
  group_by(gender) %>%
  filter(str_detect(bigram, "ecolog")) %>%
  mutate(
    word1 = replace(word1, word1 == "ecological", "ecology"),
    word2 = replace(word2, word2 == "ecological", "ecology"),
    bigram = paste(word1, word2)
  )

# One cloud per gender; min_count = 1 keeps terms that appear only once.
par(mfrow = c(1, 2))
textplot_wordcloud(
  x = dfm(tokens(with(bieco, bigram[gender == "F"]))),
  min_count = 1,
  col = "#6D57CF"
)
par(new = TRUE)
textplot_wordcloud(
  x = dfm(tokens(with(bieco, bigram[gender == "M"]))),
  min_count = 1,
  col = "#FCA532"
)

# Bigrams containing "evol"; "evolutionary" is rewritten as "evolution" in
# both word columns and the bigram text is rebuilt from them.
bievo <- bigrams_stop %>%
  group_by(gender) %>%
  filter(str_detect(bigram, "evol")) %>%
  mutate(
    word1 = replace(word1, word1 == "evolutionary", "evolution"),
    word2 = replace(word2, word2 == "evolutionary", "evolution"),
    bigram = paste(word1, word2)
  )

# One unrotated cloud per gender; min_count = 1 keeps terms seen only once.
par(mfrow = c(1, 2))
textplot_wordcloud(
  x = dfm(tokens(with(bievo, bigram[gender == "F"]))),
  min_count = 1,
  col = "#6D57CF",
  rotation = 0
)
par(new = TRUE)
textplot_wordcloud(
  x = dfm(tokens(with(bievo, bigram[gender == "M"]))),
  min_count = 1,
  col = "#FCA532",
  rotation = 0
)

TF_IDF bigram

# tf-idf of the bigrams, with each gender treated as one document.
bigram_tfidf <- bind_tf_idf(
  count(bigrams_stop, gender, bigram),
  bigram,
  gender,
  n
)
# bigram_tfidf %>%
#   arrange(desc(tf_idf))

# The 3 highest-scoring bigrams per gender, drawn as bars in separate facets.
bigram_tfidf %>%
  group_by(gender) %>%
  slice_max(tf_idf, n = 3) %>%
  ungroup() %>%
  ggplot(aes(
    x = tf_idf,
    y = fct_reorder(bigram, tf_idf),
    fill = gender
  )) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ gender, scales = "free") +
  labs(x = "tf-idf", y = NULL) +
  theme_minimal()

# Graph of the word pairs that occur more than once; the two word columns
# come first in the table handed to graph_from_data_frame().
alice_graph <- graph_from_data_frame(
  filter(count(bigrams_stop, word1, word2), n > 1)
)

# Fixed seed before drawing the "fr" layout.
set.seed(2021)
ggraph(alice_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(
    aes(label = name),
    vjust = 1,
    hjust = 1
  )

# Graph of the "ecolog" bigrams used by gender "F": word pairs are counted,
# "ecological" is rewritten as "ecology", and the table becomes a graph.
alice_graph <- bieco %>%
  filter(gender == "F") %>%
  ungroup() %>%
  count(word1, word2) %>%
  mutate(
    word1 = replace(word1, word1 == "ecological", "ecology"),
    word2 = replace(word2, word2 == "ecological", "ecology")
  ) %>%
  graph_from_data_frame()

# Fixed seed before drawing the "fr" layout.
set.seed(2021)
ggraph(alice_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(
    aes(label = name),
    vjust = 1,
    hjust = 0.5
  )

# Graph of the "ecolog" bigrams used by gender "M": word pairs are counted,
# "ecological" is rewritten as "ecology", and the table becomes a graph.
alice_graph <- bieco %>%
  filter(gender == "M") %>%
  ungroup() %>%
  count(word1, word2) %>%
  mutate(
    word1 = replace(word1, word1 == "ecological", "ecology"),
    word2 = replace(word2, word2 == "ecological", "ecology")
  ) %>%
  graph_from_data_frame()

# Fixed seed before drawing the "fr" layout.
set.seed(2022)
ggraph(alice_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(
    aes(label = name),
    vjust = 1,
    hjust = 0.5
  )

# Graph of the word pairs that occur more than once within a gender.
# graph_from_data_frame() takes the first two columns as the edge endpoints.
# The grouped count() put `gender` first, so edges ran from gender to word1;
# word1/word2 now lead the table and gender and n follow as edge attributes.
alice_graph <- bigrams_stop %>%
  count(gender, word1, word2) %>%
  filter(n > 1) %>%
  dplyr::select(word1, word2, gender, n) %>%
  graph_from_data_frame()

# Fixed seed before drawing the "fr" layout.
set.seed(2021)
ggraph(alice_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name),
                 vjust = 1, hjust = 1)

We get a sense of which words occur together, but the graph could definitely look prettier and it’s unclear which word occurs first: is it “rose tree” or “tree rose”, for example?

We’ll create an object called “a” that saves an arrow shape:

# Closed arrow head, 0.15 inches long, used for the directed edges below.
a <- grid::arrow(
  length = unit(.15, "inches"),
  type = "closed"
)

This way, we can indicate how the words in the bigrams are ordered.

Nicer graph:

# Directed version of the bigram graph: arrows show the order of the two
# words, and the end cap keeps each arrow head off its target node.
# The plot title no longer carries the stray double quote that was typed
# inside the string.
ggraph(alice_graph, layout = "fr") +
  geom_edge_link(aes(edge_alpha = 1), # constant; `n` was tried here earlier
                 show.legend = FALSE,
                 arrow = a, end_cap = circle(.03, 'inches')) +
  geom_node_point(color = "#34013f", size = 3) + # larger, purple nodes
  geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
  theme_void() +
  labs(title = "Bigrams (two-word combinations)")

Abstract + title

# Reload the presentation records for the abstract analysis, then parse
# `date` as day-month-year and derive the calendar year from it.
data <- read.table(
  "data/presentations_PPGE_2008-2019.csv",
  sep = ",",
  header = TRUE,
  as.is = TRUE
)
data$date <- dmy(data$date)
data$year <- year(data$date)
# skimr::skim(data)

Excluding special events, such as round tables and discussions, that are not related to a project or study presented by a speaker.

Using abstracts in portuguese

# Drop the listed record ids, then keep the rows that have an abstract and
# whose abstract language is "port"; finally tabulate gender.
IDs <- c(154, 250, 211, 289)
data <- filter(
  data,
  !(id %in% IDs),
  !is.na(abstract_original),
  abstract_language == "port"
)
table(data$gender)
## 
##   F   M 
##  86 101

Tidytext

# Columns for the abstract analysis, plus `text`: the original title and the
# original abstract pasted together.
tit <- mutate(
  dplyr::select(data, id, gender, position_cat, audience_n,
                abstract_original, title_original),
  text = paste(title_original, abstract_original)
)

# One row per word of the combined title + abstract text.
text_tok <- unnest_tokens(tit, output = word, input = text)

# Portuguese stopword list.
stop_w <- tibble(word = stopwords("pt"))

# Remove the stopwords and three extra words from the corpus.
# NOTE(review): the first extra word looks mis-encoded here — confirm the file
# encoding so that it still matches the intended token.
text <- filter(
  anti_join(text_tok, stop_w, by = "word"),
  !(word %in% c("Ă©", "sobre", "ser"))
)

# Frequency of each remaining word.
pala <- count(text, word)

Palavras mais comuns

# Table of the words that occur more than 8 times, most frequent first.
kable(
  filter(count(text, word, sort = TRUE), n > 8)
)
word n
espécies 224
machos 78
diferentes 75
ĂĄreas 70
podem 66
padrÔes 64
diversidade 61
ecologia 61
estudo 60
estudos 58
paisagem 57
pode 54
espécie 53
evolução 50
habitat 50
resultados 50
comunidades 49
florestas 49
processos 48
grande 47
modelo 47
maior 46
dados 45
populaçÔes 45
brasil 44
ambientais 43
palestra 43
plantas 43
além 42
efeitos 42
estrutura 41
mudanças 41
uso 41
indivĂ­duos 40
interaçÔes 40
recursos 40
trabalho 40
paisagens 39
atlĂąntica 38
conservação 38
comportamento 37
durante 37
seleção 37
ainda 36
fatores 36
forma 36
abelhas 35
mata 35
naturais 35
distribuição 34
ecolĂłgicas 33
aves 31
bem 31
disso 31
extinção 31
importante 31
informaçÔes 31
modelos 31
nesta 31
animais 30
longo 30
natural 30
redes 30
dinĂąmica 29
fĂȘmeas 29
insetos 29
ĂĄrea 28
comunidade 28
entender 28
florestais 28
outros 28
papel 28
quais 28
relação 28
abordagem 27
cada 27
estratégias 27
fragmentos 27
hipĂłtese 27
paulo 27
risco 27
sistema 27
variação 27
ambientes 26
campo 26
dois 26
genética 26
mecanismos 26
objetivo 26
onde 26
parte 26
pesquisa 26
quanto 26
anĂĄlise 25
biodiversidade 25
caracterĂ­sticas 25
conhecimento 25
ecolĂłgicos 25
ecossistemas 25
importĂąncia 25
neste 25
regiĂŁo 25
alguns 24
efeito 24
processo 24
sendo 24
sexual 24
tempo 24
grupo 23
modelagem 23
presença 23
principais 23
qualidade 23
sistemas 23
anĂĄlises 22
anos 22
desenvolvimento 22
dessas 22
dispersĂŁo 22
diversificação 22
entanto 22
importantes 22
manejo 22
perda 22
produção 22
protegidas 22
sido 22
1 21
apesar 21
através 21
condiçÔes 21
deve 21
duas 21
locais 21
meio 21
população 21
vegetação 21
2 20
alta 20
ambiente 20
benefĂ­cios 20
biologia 20
ciĂȘncia 20
possuem 20
pouco 20
sobrevivĂȘncia 20
variaçÔes 20
acasalamento 19
ambiental 19
apresentar 19
conectividade 19
disponibilidade 19
diversos 19
elementos 19
grupos 19
muitas 19
padrĂŁo 19
principalmente 19
serviços 19
sociais 19
sucesso 19
tamanho 19
tanto 19
trĂȘs 19
assim 18
biomassa 18
campos 18
caracteres 18
cobertura 18
fauna 18
florestal 18
formigas 18
fragmentadas 18
interação 18
maioria 18
of 18
projeto 18
tĂȘm 18
utilizando 18
animal 17
ĂĄrvores 17
competição 17
desta 17
estado 17
evolutiva 17
nessa 17
novas 17
organismos 17
partir 17
possĂ­vel 17
restauração 17
tropicais 17
vida 17
algumas 16
apresentados 16
capacidade 16
cerrado 16
climĂĄticas 16
consequĂȘncias 16
custos 16
dentro 16
entretanto 16
estratégia 16
floresta 16
funcional 16
fundamental 16
histĂłria 16
medidas 16
menos 16
mutualismos 16
norte 16
porém 16
regiÔes 16
relacionados 16
tais 16
terra 16
tipo 16
abordagens 15
apresentarei 15
atividades 15
avaliar 15
clima 15
comportamentais 15
construção 15
cuidado 15
dessa 15
desses 15
deste 15
ecolĂłgica 15
escala 15
grandes 15
neotropicais 15
nĂ­vel 15
polinização 15
recentes 15
rio 15
ĂĄgua 14
apenas 14
apĂłs 14
ciĂȘncias 14
composição 14
enquanto 14
espacial 14
formação 14
mundo 14
nesse 14
populacional 14
serem 14
sob 14
trabalhos 14
vou 14
3 13
abundĂąncia 13
açÔes 13
apresentação 13
apresentam 13
aspectos 13
baixa 13
cerca 13
complexos 13
corais 13
corredores 13
defesa 13
desenvolvidos 13
devido 13
doutorado 13
ecolĂłgico 13
entendimento 13
escalas 13
especiação 13
evolutivos 13
exemplo 13
facilitação 13
ferramenta 13
impactos 13
in 13
influĂȘncia 13
local 13
matriz 13
menor 13
natureza 13
nĂșmero 13
polĂ­ticas 13
presente 13
principal 13
quantidade 13
relaçÔes 13
reprodutivo 13
respostas 13
riqueza 13
sementes 13
soja 13
ter 13
variĂĄveis 13
vez 13
adaptaçÔes 12
alimentares 12
alunos 12
anfĂ­bios 12
anuros 12
apresenta 12
associação 12
associadas 12
aumentar 12
carbono 12
cientĂ­ficos 12
comportamental 12
consumo 12
contra 12
desde 12
desse 12
dimorfismo 12
espaciais 12
evidĂȘncias 12
famĂ­lia 12
fenotĂ­pica 12
ferramentas 12
fragmentação 12
geogrĂĄfica 12
ilhas 12
literatura 12
ocorrĂȘncia 12
organização 12
paternal 12
perspectivas 12
teoria 12
todo 12
tradicionais 12
alimentação 11
amazĂŽnia 11
amplamente 11
atributos 11
atual 11
atualmente 11
bandos 11
biolĂłgica 11
caso 11
cientĂ­fico 11
complexo 11
contexto 11
desconexĂŁo 11
dieta 11
disciplina 11
diversas 11
ecossistĂȘmicos 11
eficiĂȘncia 11
estudar 11
evolutivas 11
expansĂŁo 11
fluxo 11
humanos 11
indicam 11
manguezais 11
mistos 11
porque 11
prĂĄticas 11
presentes 11
probabilidade 11
prole 11
reprodução 11
resposta 11
territĂłrio 11
usando 11
valor 11
afetar 10
ano 10
biolĂłgicas 10
conjunto 10
cĂłpulas 10
décadas 10
declĂ­nio 10
diplĂłides 10
discutir 10
distĂąncia 10
estar 10
exemplos 10
florais 10
humana 10
intra 10
isolamento 10
linhagens 10
mamĂ­feros 10
maneira 10
manter 10
matrizes 10
mestrado 10
mostrar 10
nativas 10
neotropical 10
ocorre 10
peixes 10
planta 10
portanto 10
possĂ­veis 10
problemas 10
projetos 10
questÔes 10
similares 10
solo 10
sul 10
sustentĂĄvel 10
unidades 10
visĂŁo 10
acesso 9
alimento 9
and 9
aplicaçÔes 9
associados 9
avaliação 9
balanço 9
capazes 9
central 9
cientĂ­fica 9
corpo 9
curso 9
destas 9
embora 9
ensino 9
frutos 9
funcionamento 9
gĂȘnico 9
hipĂłteses 9
hoje 9
i.e 9
identificar 9
ilha 9
impacto 9
implicaçÔes 9
inter 9
irei 9
modo 9
ninhos 9
nordeste 9
ovos 9
parĂąmetros 9
permite 9
pessoas 9
planejamento 9
polinizadores 9
pĂłs 9
poucas 9
predação 9
predadores 9
realizados 9
recentemente 9
representam 9
reprodutiva 9
revista 9
serviço 9
social 9
sociedade 9
sĂłcio 9
sub 9
sugerem 9
temperatura 9
testamos 9
testar 9
tipos 9
Ășltimas 9
utilizadas 9
# Word counts per gender and each word's share of that gender's words,
# reshaped to one row per word with a proportion and an n column per gender.
props <- count(text, gender, word) %>%
  # filter(n > 1) %>% # would drop words used only once
  group_by(gender) %>%
  mutate(proportion = n / sum(n)) %>%
  pivot_wider(
    names_from = gender,
    values_from = c(proportion, n)
  )
library(scales)
# Log-log scatter of each word's share in the text of male (x) versus female
# (y) speakers; the dashed line is the identity line.
# The colour mapping lives inside aes() so that scale_color_gradient() has a
# mapped aesthetic to act on; passed as an extra ggplot() argument it was
# silently ignored. The stray empty argument in aes() is removed as well.
ggplot(props, aes(x = proportion_M, y = proportion_F,
                  color = abs(proportion_F - proportion_M))) +
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
  scale_x_log10(labels = percent_format(), limits = c(0.0005, 0.03)) +
  scale_y_log10(labels = percent_format(), limits = c(0.0005, 0.03)) +
  scale_color_gradient(low = "blue", high = "red")

# Same male-versus-female comparison on untransformed proportion axes.
ggplot(props, aes(x = proportion_M, y = proportion_F)) +
  geom_point(alpha = 0.1) +
  geom_abline(color = "gray40", lty = 2) +
  geom_text(aes(label = word), check_overlap = TRUE)

# Raw counts instead of proportions, with both axes limited to -1..30.
ggplot(props, aes(x = n_M, y = n_F)) +
  geom_point(alpha = 0.1) +
  geom_abline(color = "gray40", lty = 2) +
  geom_text(aes(label = word), check_overlap = TRUE) +
  xlim(-1, 30) +
  ylim(-1, 30)

# Words that occur more than 8 times overall, most frequent first.
seleciona <- filter(arrange(pala, desc(n)), n > 8)

# Per-gender counts and proportions recomputed on those frequent words only.
props <- text %>%
  filter(word %in% seleciona$word) %>%
  count(gender, word) %>%
  group_by(gender) %>%
  mutate(proportion = n / sum(n)) %>%
  pivot_wider(
    names_from = gender,
    values_from = c(proportion, n)
  )


# Long table for the diverging bar chart: one row per word and gender, with
# the female proportion negated so its bars extend to the left of zero.
test <- props %>%
  arrange(desc(proportion_F), desc(proportion_M)) %>%
  # A word used by one gender only has NA for the other; count it as 0 so the
  # total, and therefore the ordering of the bars, stays defined.
  mutate(ntot = coalesce(n_F, 0L) + coalesce(n_M, 0L)) %>%
  mutate(word = fct_reorder(word, ntot, max)) %>%
  mutate(proportion_F = -proportion_F)

# Columns are picked by name rather than by position so the reshape does not
# depend on the column order produced by pivot_wider().
test <- test %>%
  dplyr::select(word, proportion_F, proportion_M) %>%
  pivot_longer(c(proportion_F, proportion_M),
               names_to = "gender", values_to = "proportion")

# Diverging bar chart of word proportions by gender for the title + abstract
# text (the female values were negated upstream).
test %>%
  ggplot(aes(x = proportion, y = word, fill = gender)) +
  geom_col() +
  ylab("") +
  xlab("Proportion") +
  scale_fill_manual(name = "gender", values = c("#6D57CF", "#FCA532"),
                    labels = c("F", "M"))

# Saved under its own file name: this call previously reused
# "figures/title_wordFrequency.jpeg" and overwrote the figure written by the
# title analysis.
ggsave("figures/abstract_wordFrequency.jpeg", units = "in", width = 7, height = 7, dpi = 300)

word cloud

# Word cloud built from every word of the title + abstract text.
textplot_wordcloud(x = dfm(tokens(text$word)))

# One cloud per gender, each in its own colour, on a 1 x 2 layout.
# NOTE(review): par(new = TRUE) normally keeps the next base-graphics plot on
# the current panel — confirm the two clouds end up side by side as intended.
par(mfrow = c(1, 2))
textplot_wordcloud(
  x = dfm(tokens(with(text, word[gender == "F"]))),
  col = "#6D57CF"
)
par(new = TRUE)
textplot_wordcloud(
  x = dfm(tokens(with(text, word[gender == "M"]))),
  col = "#FCA532"
)

TF IDF

# tf-idf of the title + abstract words, with each gender treated as one
# document, sorted from highest to lowest score.
text_id <- text %>%
  count(gender, word) %>%
  bind_tf_idf(word, gender, n) %>%
  arrange(desc(tf_idf))
# text_id
text_id$word <- as.factor(text_id$word)

# Top 5 words per gender by tf-idf (ties are kept by top_n()).
# The axis labels now follow the aesthetics: tf-idf is on x and the words on
# y; previously the word axis was titled "tf-idf" and the value axis blank.
text_id %>%
  group_by(gender) %>%
  arrange(desc(tf_idf)) %>%
  top_n(5, tf_idf) %>%
  ggplot(aes(x = tf_idf, y = reorder(word, tf_idf), fill = gender)) +
  geom_col(show.legend = FALSE) +
  labs(x = "tf-idf", y = NULL) +
  facet_wrap(~gender, scales = "free") +
  theme_minimal()